GitHub Repository: debakarr/machinelearning
Path: blob/master/Part 9 - Dimension Reduction/Principal Component Analysis/[Python] Principal Component Analysis.ipynb
Kernel: Python 3

Principal Component Analysis

Data preprocessing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split  # for training and testing split
from sklearn.preprocessing import StandardScaler      # for feature scaling
from sklearn.decomposition import PCA                 # for applying PCA
from sklearn.linear_model import LogisticRegression   # for the classifier
from sklearn.metrics import confusion_matrix          # for making the confusion matrix
from matplotlib.colors import ListedColormap          # for visualization

%matplotlib inline
plt.rcParams['figure.figsize'] = [14, 8]

# Importing the dataset
dataset = pd.read_csv('Wine.csv')
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values
dataset.head(5)
dataset.tail(5)
np.set_printoptions(suppress=True, threshold=13)
X[0]
array([ 14.23, 1.71, 2.43, 15.6 , 127. , 2.8 , 3.06, 0.28, 2.29, 5.64, 1.04, 3.92, 1065. ])
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Feature Scaling
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

Applying Principal Component Analysis

First, let's see how much of the variance is explained by the 13 independent variables. For this we create a PCA object with the n_components parameter set to None.

pca = PCA(n_components = None)
pca.fit(X_train)  # fit only, to inspect the variance explained by each component
# note: X_test is not transformed here; it is transformed once below,
# after the number of components has been chosen
explained_variance = pca.explained_variance_ratio_

Then let's print the explained variance ratio.

print(explained_variance)

array([ 0.35900066,  0.18691934,  0.11606557,  0.07371716,  0.0665386 ,
        0.04854582,  0.04195042,  0.02683922,  0.0234746 ,  0.01889734,
        0.01715943,  0.01262928,  0.00826257])

This shows the proportion of variance explained by each principal component, in descending order.

Here the 1st principal component explains about 36% of the variance, and the top two together explain about 55% (i.e. 36% + 19%). So we select the first 2 principal components.
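As a quick sanity check we can also look at the cumulative explained variance. This is a minimal sketch that assumes the explained_variance array computed in the cell above; the 0.55 threshold mentioned in the comment is only an illustration, not part of the original notebook.

cumulative_variance = np.cumsum(explained_variance)  # running total of explained variance
print(cumulative_variance[:2])                       # share covered by the first two components

# PCA can also choose the smallest number of components that reaches a
# target share of variance, e.g. at least 55%:
# pca = PCA(n_components = 0.55)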

pca = PCA(n_components = 2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
explained_variance
array([ 0.35900066, 0.18691934])
X_test[0]
array([ 2.06784347, -1.02818265])
X_train[0]
array([-1.16602698, -3.61532732])

Fitting Logistic Regression to the Training Set

classifier = LogisticRegression(random_state = 42)
classifier.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=42, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

Predicting the Test set results

y_pred = classifier.predict(X_test)
y_pred[0:10]
array([1, 1, 3, 1, 2, 1, 2, 3, 2, 3])
y_test[0:10]
array([1, 1, 3, 1, 2, 1, 2, 3, 2, 3])

Making the Confusion Matrix

cm = confusion_matrix(y_test, y_pred)
cm
array([[14,  0,  0],
       [ 0, 14,  0],
       [ 0,  0,  8]])

Here we have no incorrect predictions at all: every test sample lies on the diagonal of the confusion matrix.
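For a per-class breakdown beyond the raw confusion matrix, scikit-learn's classification_report gives precision, recall and F1-score. This is a small sketch assuming the y_test and y_pred arrays from above.

from sklearn.metrics import classification_report  # per-class precision, recall and F1-score

print(classification_report(y_test, y_pred))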

Accuracy

(cm[0][0] + cm[1][1] + cm[2][2])/ np.sum(cm)
1.0
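The same value can also be obtained directly with scikit-learn's accuracy_score, shown here only as a cross-check of the manual computation above.

from sklearn.metrics import accuracy_score  # fraction of correctly predicted samples

accuracy_score(y_test, y_pred)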

Visualizing the training set results

X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))

# plot the contour
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))

# plot the points
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green', 'blue'))(i), label = j,
                edgecolors = 'white', linewidth = 0.7)
plt.title('Logistic Regression (Training set)')
plt.xlabel('1st Principal Component')
plt.ylabel('2nd Principal Component')
plt.legend()
<matplotlib.legend.Legend at 0x7f75a22c4a58>
Image in a Jupyter notebook

Visualizing the test set results

X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))

# plot the contour
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))

# plot the points
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green', 'blue'))(i), label = j,
                edgecolors = 'white', linewidth = 0.7)
plt.title('Logistic Regression (Test set)')
plt.xlabel('1st Principal Component')
plt.ylabel('2nd Principal Component')
plt.legend()
<matplotlib.legend.Legend at 0x7f75a1294780>
Image in a Jupyter notebook